{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "#频繁项集：经常一起出现的物品集合\n",
    "#关联规则：两种物品存在的某种很强的关系\n",
    "\n",
    "#支持度support:数据集中包含该项集的记录所占总交易记录的比例\n",
    "#最小支持度：min_support,最小的比例，表示一种下界\n",
    "\n",
    "#可信度：confidence,针对某种关联规则{A}——>{B}的可信度=support({A,B}) / support({A})\n",
    "#最小可信度：min_confidence,最小的可信度\n",
    "\n",
    "#Apriori作用：减少可能感兴趣的项集，避免物品很多时，需要计算的项集呈指数增长，N种物品，(2**N-1)种的项集组合\n",
    "#Apriori原理：如果某个项集是频繁的，那么他的所有子集也是频繁的\n",
    "#延伸：如果某个项集是非频繁的，那么他的所有超集也是非频繁的\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]\n[frozenset([1]), frozenset([2]), frozenset([3]), frozenset([4]), frozenset([5])]\n[set([1, 3, 4]), set([2, 3, 5]), set([1, 2, 3, 5]), set([2, 5])]\n"
     ]
    }
   ],
   "source": [
    "#任务1：发现频繁项集——————利用Apriori,只保留k个元素的频繁项集\n",
    "\n",
    "#交易集合\n",
    "def loadDataSet():\n",
    "    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]\n",
    "#创建C1，单元素集合，[[1],[2],[3],[4],[5]]\n",
    "def createC1(dataSet):\n",
    "    C1 = []\n",
    "    for transaction in dataSet:\n",
    "        for item in transaction:\n",
    "            if not [item] in C1:\n",
    "                C1.append([item])\n",
    "                \n",
    "    C1.sort()\n",
    "    return map(frozenset, C1)#use frozen set so we\n",
    "    #can use it as a key in a dict \n",
    "    #对C1中的[i]都做frozenset()操作，改成不可变的集合\n",
    "dataSet = loadDataSet()\n",
    "print dataSet\n",
    "C1 = createC1(dataSet)\n",
    "print C1\n",
    "#交易记录\n",
    "D = map(set,dataSet)\n",
    "print D\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])]\n{frozenset([4]): 0.25, frozenset([5]): 0.75, frozenset([2]): 0.75, frozenset([3]): 0.75, frozenset([1]): 0.5}\n"
     ]
    }
   ],
   "source": [
    "#去除交易集合D中，不满足最小支持度minSupport的包含k个元素的项集\n",
    "#得到满足条件的，频繁项集Lk（用list存储）\n",
    "def scanD(D, Ck, minSupport):\n",
    "    ssCnt = {} #字典：{[i]:出现次数}\n",
    "    for tid in D:\n",
    "        for can in Ck:\n",
    "            if can.issubset(tid):#是子集合，[i]出现在某条交易记录中\n",
    "                if not ssCnt.has_key(can): \n",
    "                    ssCnt[can]=1\n",
    "                else: \n",
    "                    ssCnt[can] += 1\n",
    "    numItems = float(len(D)) #float()是为了除法有小数\n",
    "    Lk = [] #返回的满足条件的频繁项集的集合\n",
    "    supportData_k = {}#记录{项集:支持度}\n",
    "    for key in ssCnt: #遍历字典的key\n",
    "        support = ssCnt[key]/numItems\n",
    "        if support >= minSupport:\n",
    "            Lk.insert(0,key) #在列表首添加\n",
    "        supportData_k[key] = support\n",
    "    return Lk, supportData_k\n",
    "\n",
    "#最小支持度=0.5\n",
    "L1,supportData_1 = scanD(D,C1,0.5)\n",
    "print L1\n",
    "print supportData_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[frozenset([1]), frozenset([3]), frozenset([2]), frozenset([5])], [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])], [frozenset([2, 3, 5])], []]\n{frozenset([5]): 0.75, frozenset([3]): 0.75, frozenset([2, 3, 5]): 0.5, frozenset([1, 2]): 0.25, frozenset([1, 5]): 0.25, frozenset([3, 5]): 0.5, frozenset([4]): 0.25, frozenset([2, 3]): 0.5, frozenset([2, 5]): 0.75, frozenset([1]): 0.5, frozenset([1, 3]): 0.5, frozenset([2]): 0.75}\n"
     ]
    }
   ],
   "source": [
    "#频繁项集链表Lk,项集元素的个数k\n",
    "#创建k个元素的候选项集Ck\n",
    "def aprioriGen(Lk, k): #creates Ck\n",
    "    retList = []\n",
    "    lenLk = len(Lk)\n",
    "    for i in range(lenLk):\n",
    "        for j in range(i+1, lenLk): \n",
    "            L1 = list(Lk[i])[:k-2]  #k个元素，求前面(k-2)个元素都相同的集合的并集,k=(k-2)+1+1\n",
    "            L2 = list(Lk[j])[:k-2]\n",
    "            L1.sort()\n",
    "            L2.sort()\n",
    "            if L1==L2: #if first k-2 elements are equal\n",
    "                retList.append(Lk[i] | Lk[j]) #set union并集\n",
    "    return retList\n",
    "\n",
    "\n",
    "def apriori(dataSet, minSupport = 0.5):\n",
    "    C1 = createC1(dataSet) #初始的候选项集C1\n",
    "    D = map(set, dataSet) #交易集合\n",
    "    L1, supportData = scanD(D, C1, minSupport) #初始的频繁项集L1\n",
    "    \n",
    "    L = [L1]\n",
    "    k = 2\n",
    "    while (len(L[k-2]) > 0):\n",
    "        Ck = aprioriGen(L[k-2], k)\n",
    "        Lk, supK = scanD(D, Ck, minSupport)#scan DB to get Lk\n",
    "        supportData.update(supK) #添加字典\n",
    "        L.append(Lk)\n",
    "        k += 1\n",
    "    return L, supportData\n",
    "\n",
    "L,supportData = apriori(dataSet)\n",
    "print L\n",
    "print supportData"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "frozenset([2, 3, 5])\n[frozenset([2]), frozenset([3]), frozenset([5])]\n"
     ]
    }
   ],
   "source": [
    "#任务2：从频繁项集中挖掘关联规则：使用可信度confidence\n",
    "\n",
    "#特点：某条规则不满足最小可信度要求，则该规则左边的所有子集也不会满足最小可信度要求\n",
    "#{0,1,2}——>{3}不满足。则{0,1,2}子集作为左边（前件）的规则也不满足\n",
    "\n",
    "#或者：{0,1,2}——>{3}不满足。则所有包含{3}作为右边（后件）的规则也不满足。(代码采用这个原理)\n",
    "\n",
    "#步骤：首先从一个频繁项集开始，接着创建一个规则列表，其中规则右部（后件）从只有一个元素开始\n",
    "#然后对这些规则进行测试 ，接下来合并符合条件的后件（从右部有1个元素到2个元素，2->3,3->4，\n",
    "# 到频繁项集的大小-1,如此循环，测试规则是否符合,）   P210图11-4\n",
    "            \n",
    "def generateRules(L, supportData, minConf=0.7):  #supportData is a dict coming from scanD\n",
    "    bigRuleList = []\n",
    "    for i in range(1, len(L)):#only get the sets with two or more items\n",
    "        #因为无法从单元素项集中构建关联规则,所以没有下标0\n",
    "        print \"-------------this is in L(%d)------------\"%(i)\n",
    "        print \"--------\",L[i],'-----------'\n",
    "        for freqSet in L[i]:\n",
    "            H1 = [frozenset([item]) for item in freqSet] #单元素集合\n",
    "            #frozenset([1,2])——>[frozenset([1]),frozenset([2])]\n",
    "            if (i > 1): #频繁项集的元素数目>2\n",
    "                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)\n",
    "            else: #频繁项集中元素=2,直接计算可信度\n",
    "                calcConf(freqSet, H1, supportData, bigRuleList, minConf)\n",
    "    return bigRuleList         \n",
    "\n",
    "#返回一个满足最小可信度的规则列表:yusna\n",
    "def calcConf(freqSet, H, supportData, bigRuleList, minConf=0.7):\n",
    "    print 'the frequent set: ',freqSet,'-----------'\n",
    "    prunedH = [] #create new list to return\n",
    "    for conseq in H:\n",
    "        conf = supportData[freqSet]/supportData[freqSet-conseq] #calc confidence\n",
    "        if conf >= minConf: \n",
    "            print freqSet-conseq,'-->',conseq,'conf:',conf\n",
    "            bigRuleList.append((freqSet-conseq, conseq, conf)) #元组（前件，后件，可信度）\n",
    "            prunedH.append(conseq)  #保存的是后件\n",
    "    print \"stored the right set:\",prunedH\n",
    "    return prunedH\n",
    "\n",
    "def rulesFromConseq(freqSet, H, supportData, bigRuleList, minConf=0.7):\n",
    "    m = len(H[0]) #Hm的元素个数\n",
    "   \n",
    "    if (len(freqSet) > (m + 1)): #try further merging\n",
    "        Hmp1 = aprioriGen(H, m+1)#create Hm+1 new candidates,H(m+1)个元素集合\n",
    "        Hmp1 = calcConf(freqSet, Hmp1, supportData, bigRuleList, minConf)\n",
    "        if (len(Hmp1) > 1):    #need at least two sets to merge,接着合并\n",
    "            rulesFromConseq(freqSet, Hmp1, supportData, bigRuleList, minConf)\n",
    "            \n",
    "            \n",
    "#测试一些函数\n",
    "for freqSet in L[2]:\n",
    "    print freqSet\n",
    "    H1 = [frozenset([item]) for item in freqSet] #单元素集合\n",
    "    print H1\n",
    "    \n",
    "            "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-------------this is in L(1)------------\n-------- [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])] -----------\nthe frequent set:  frozenset([1, 3]) -----------\nfrozenset([1]) --> frozenset([3]) conf: 1.0\nstored the right set: [frozenset([3])]\nthe frequent set:  frozenset([2, 5]) -----------\nfrozenset([5]) --> frozenset([2]) conf: 1.0\nfrozenset([2]) --> frozenset([5]) conf: 1.0\nstored the right set: [frozenset([2]), frozenset([5])]\nthe frequent set:  frozenset([2, 3]) -----------\nstored the right set: []\nthe frequent set:  frozenset([3, 5]) -----------\nstored the right set: []\n-------------this is in L(2)------------\n-------- [frozenset([2, 3, 5])] -----------\nthis is Hmp1 [frozenset([2, 3]), frozenset([2, 5]), frozenset([3, 5])] -----------\nthe frequent set:  frozenset([2, 3, 5]) -----------\nstored the right set: []\n-------------this is in L(3)------------\n-------- [] -----------\n[(frozenset([1]), frozenset([3]), 1.0), (frozenset([5]), frozenset([2]), 1.0), (frozenset([2]), frozenset([5]), 1.0)]\n"
     ]
    }
   ],
   "source": [
    "rules = generateRules(L,supportData,minConf=0.7)\n",
    "print rules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-------------this is in L(1)------------\n-------- [frozenset([1, 3]), frozenset([2, 5]), frozenset([2, 3]), frozenset([3, 5])] -----------\nthe frequent set:  frozenset([1, 3]) -----------\nfrozenset([3]) --> frozenset([1]) conf: 0.666666666667\nfrozenset([1]) --> frozenset([3]) conf: 1.0\nstored the right set: [frozenset([1]), frozenset([3])]\nthe frequent set:  frozenset([2, 5]) -----------\nfrozenset([5]) --> frozenset([2]) conf: 1.0\nfrozenset([2]) --> frozenset([5]) conf: 1.0\nstored the right set: [frozenset([2]), frozenset([5])]\nthe frequent set:  frozenset([2, 3]) -----------\nfrozenset([3]) --> frozenset([2]) conf: 0.666666666667\nfrozenset([2]) --> frozenset([3]) conf: 0.666666666667\nstored the right set: [frozenset([2]), frozenset([3])]\nthe frequent set:  frozenset([3, 5]) -----------\nfrozenset([5]) --> frozenset([3]) conf: 0.666666666667\nfrozenset([3]) --> frozenset([5]) conf: 0.666666666667\nstored the right set: [frozenset([3]), frozenset([5])]\n-------------this is in L(2)------------\n-------- [frozenset([2, 3, 5])] -----------\nthis is Hmp1 [frozenset([2, 3]), frozenset([2, 5]), frozenset([3, 5])] -----------\nthe frequent set:  frozenset([2, 3, 5]) -----------\nfrozenset([5]) --> frozenset([2, 3]) conf: 0.666666666667\nfrozenset([3]) --> frozenset([2, 5]) conf: 0.666666666667\nfrozenset([2]) --> frozenset([3, 5]) conf: 0.666666666667\nstored the right set: [frozenset([2, 3]), frozenset([2, 5]), frozenset([3, 5])]\n-------------this is in L(3)------------\n-------- [] -----------\n[(frozenset([3]), frozenset([1]), 0.6666666666666666), (frozenset([1]), frozenset([3]), 1.0), (frozenset([5]), frozenset([2]), 1.0), (frozenset([2]), frozenset([5]), 1.0), (frozenset([3]), frozenset([2]), 0.6666666666666666), (frozenset([2]), frozenset([3]), 0.6666666666666666), (frozenset([5]), frozenset([3]), 0.6666666666666666), (frozenset([3]), frozenset([5]), 0.6666666666666666), (frozenset([5]), frozenset([2, 3]), 0.6666666666666666), (frozenset([3]), frozenset([2, 5]), 0.6666666666666666), (frozenset([2]), frozenset([3, 5]), 0.6666666666666666)]\n"
     ]
    }
   ],
   "source": [
    "#减小可信度=0.5        \n",
    "rules = generateRules(L,supportData,minConf=0.5)\n",
    "print rules\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "frozenset(['2', '59'])\nfrozenset(['39', '2'])\nfrozenset(['2', '67'])\nfrozenset(['2', '34'])\nfrozenset(['2', '23'])\nfrozenset(['2', '86'])\nfrozenset(['76', '2'])\nfrozenset(['90', '2'])\nfrozenset(['2', '53'])\nfrozenset(['93', '2'])\nfrozenset(['63', '2'])\nfrozenset(['2', '28'])\nfrozenset(['2', '85'])\nfrozenset(['2', '36'])\n"
     ]
    }
   ],
   "source": [
    "#利用Apriori算法，发现毒蘑菇的相似特征：就是毒蘑菇特征的频繁项集\n",
    "\n",
    "mushDatSet = [line.split() for line in open('apriori/mushroom.dat').readlines()]\n",
    "L,supportData = apriori(mushDatSet,0.3)\n",
    "for item in L[1]:\n",
    "    if item.intersection('2'):#交集\n",
    "        print item\n",
    "        \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "    \n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
